In [8]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
import string
In [9]:
import nltk                       #for working with human language data & text cleaning
In [10]:
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from collections import Counter
from nltk.tokenize import word_tokenize
In [11]:
s_df=pd.read_csv("C:\\Users\\djo16\\Cognifyz\\Dataset-copy(1).csv")
print(s_df.head())
   Restaurant ID         Restaurant Name  Country Code              City  \
0        6317637        Le Petit Souffle           162       Makati City   
1        6304287        Izakaya Kikufuji           162       Makati City   
2        6300002  Heat - Edsa Shangri-La           162  Mandaluyong City   
3        6318506                    Ooma           162  Mandaluyong City   
4        6314302             Sambo Kojin           162  Mandaluyong City   

                                             Address  \
0  Third Floor, Century City Mall, Kalayaan Avenu...   
1  Little Tokyo, 2277 Chino Roces Avenue, Legaspi...   
2  Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...   
3  Third Floor, Mega Fashion Hall, SM Megamall, O...   
4  Third Floor, Mega Atrium, SM Megamall, Ortigas...   

                                     Locality  \
0   Century City Mall, Poblacion, Makati City   
1  Little Tokyo, Legaspi Village, Makati City   
2  Edsa Shangri-La, Ortigas, Mandaluyong City   
3      SM Megamall, Ortigas, Mandaluyong City   
4      SM Megamall, Ortigas, Mandaluyong City   

                                    Locality Verbose   Longitude   Latitude  \
0  Century City Mall, Poblacion, Makati City, Mak...  121.027535  14.565443   
1  Little Tokyo, Legaspi Village, Makati City, Ma...  121.014101  14.553708   
2  Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...  121.056831  14.581404   
3  SM Megamall, Ortigas, Mandaluyong City, Mandal...  121.056475  14.585318   
4  SM Megamall, Ortigas, Mandaluyong City, Mandal...  121.057508  14.584450   

                           Cuisines  ...          Currency Has Table booking  \
0        French, Japanese, Desserts  ...  Botswana Pula(P)               Yes   
1                          Japanese  ...  Botswana Pula(P)               Yes   
2  Seafood, Asian, Filipino, Indian  ...  Botswana Pula(P)               Yes   
3                   Japanese, Sushi  ...  Botswana Pula(P)                No   
4                  Japanese, Korean  ...  Botswana Pula(P)               Yes   

  Has Online delivery Is delivering now Switch to order menu Price range  \
0                  No                No                   No           3   
1                  No                No                   No           3   
2                  No                No                   No           4   
3                  No                No                   No           4   
4                  No                No                   No           4   

   Aggregate rating  Rating color Rating text Votes  
0               4.8    Dark Green   Excellent   314  
1               4.5    Dark Green   Excellent   591  
2               4.4         Green   Very Good   270  
3               4.9    Dark Green   Excellent   365  
4               4.8    Dark Green   Excellent   229  

[5 rows x 21 columns]

Task 1: Restraunt Reviews¶

Analyze the text reviews to identify the mostcommon positive and negative keywords.

In [11]:
ratings = s_df['Rating text']
ratings
Out[11]:
0       Excellent
1       Excellent
2       Very Good
3       Excellent
4       Excellent
          ...    
9546    Very Good
9547    Very Good
9548         Good
9549    Very Good
9550    Very Good
Name: Rating text, Length: 9551, dtype: object
In [13]:
nltk.download('punkt')
nltk.download('stopwords')
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\djo16\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\djo16\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Out[13]:
True
In [15]:
nltk.download('vader_lexicon')
s_i_a= SentimentIntensityAnalyzer()
stop_words = set(stopwords.words('english'))
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\djo16\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
In [17]:
positive_words =[]
negative_words =[]
In [21]:
for rating_text in ratings:
    
    tokens= word_tokenize(rating_text.lower())
    tokens=[token for token in tokens if token.isalpha() and token not in stop_words]
    sentiment_score= s_i_a.polarity_scores(rating_text)['compound']
 

if sentiment_score >= 0.01:
    positive_words.extend(tokens)
elif sentiment_score <= 0.01:
    negative_words.extend(tokens)
    
In [23]:
# most common
common_positives= Counter(positive_words)
common_negatives=Counter(negative_words)
In [25]:
num_top_keywords = 10
print('\nTop positive:')
for keyword, count in common_positives.most_common(num_top_keywords):
    print(f"{keyword}:{count} times")
Top positive:
good:1 times
In [27]:
print("Top negative keywords : ")
for word, count in common_negatives.most_common(num_top_keywords):
    print(f"{word}: {count} times")
Top negative keywords : 

Calculate the average length of reviews andexplore if there is a relationship betweenreview length and rating.

In [30]:
df_explore = s_df[['Rating text','Aggregate rating']].copy()
df_explore['Review Length'] = df_explore['Rating text'].apply(lambda x: len(str(x)))
avg_review_lengths = df_explore.groupby('Aggregate rating')['Review Length'].mean()
print(avg_review_lengths)
Aggregate rating
0.0    9.0
1.8    4.0
1.9    4.0
2.0    4.0
2.1    4.0
2.2    4.0
2.3    4.0
2.4    4.0
2.5    7.0
2.6    7.0
2.7    7.0
2.8    7.0
2.9    7.0
3.0    7.0
3.1    7.0
3.2    7.0
3.3    7.0
3.4    7.0
3.5    4.0
3.6    4.0
3.7    4.0
3.8    4.0
3.9    4.0
4.0    9.0
4.1    9.0
4.2    9.0
4.3    9.0
4.4    9.0
4.5    9.0
4.6    9.0
4.7    9.0
4.8    9.0
4.9    9.0
Name: Review Length, dtype: float64
In [32]:
plt.figure(figsize=(10,6))
avg_review_lengths.plot(kind='bar',color='purple')
plt.title('Average Review Length of Reviews')
plt.xlabel('Aggregate rating')
plt.ylabel('Average review length character')
plt.show()
No description has been provided for this image

Task 2: Votes Analysis¶

Identify the restaurants with the highest andlowest number of votes.

In [36]:
df_v = s_df.dropna(subset=['Votes', 'Restaurant Name'])
df_v= s_df[['Votes', 'Restaurant Name']]
df_v
Out[36]:
Votes Restaurant Name
0 314 Le Petit Souffle
1 591 Izakaya Kikufuji
2 270 Heat - Edsa Shangri-La
3 365 Ooma
4 229 Sambo Kojin
... ... ...
9546 788 Naml۱ Gurme
9547 1034 Ceviz A��ac۱
9548 661 Huqqa
9549 901 A���k Kahve
9550 591 Walter's Coffee Roastery

9551 rows × 2 columns

In [44]:
print("Restaurant(s) with highest votes:")
max_votes = df_v[df_v['Votes'] == df_v['Votes'].max()]
max_votes
Restaurant(s) with highest votes:
Out[44]:
Votes Restaurant Name
728 10934 Toit
In [46]:
print("Restaurant with lowest votes:")
min_votes=df_v.loc[df_v['Votes'].idxmin()]
min_votes
Restaurant with lowest votes:
Out[46]:
Votes                             0
Restaurant Name    Cantinho da Gula
Name: 69, dtype: object
In [ ]:
 

Analyze if there is a correlation between thenumber of votes and the rating of arestaurant.

In [65]:
print(s_df.columns.tolist())
['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address', 'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines', 'Average Cost for two', 'Currency', 'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu', 'Price range', 'Aggregate rating', 'Rating color', 'Rating text', 'Votes']
In [63]:
correlation = s_df['Votes'].corr(s_df["Aggregate rating"])
print(f"Correlation between Votes and Rating: {correlation:.2f}")
Correlation between Votes and Rating: 0.31
In [69]:
fig = px.scatter(s_df,
                 x='Votes',
                 y='Aggregate rating',
                 title='Votes vs Aggregate Rating',
                 labels={'Votes': 'Number of Votes',
                         'Aggregate rating': 'Aggregate Rating'},
                width=1100, height = 800 )
fig.show()

Task 3: Price Range vs Online Delivery & Table Booking¶

Analyze if there is a relationship between the price range and the availability of online delivery and table booking.

In [16]:
s_df.head(5)
Out[16]:
Restaurant ID Restaurant Name Country Code City Address Locality Locality Verbose Longitude Latitude Cuisines ... Currency Has Table booking Has Online delivery Is delivering now Switch to order menu Price range Aggregate rating Rating color Rating text Votes
0 6317637 Le Petit Souffle 162 Makati City Third Floor, Century City Mall, Kalayaan Avenu... Century City Mall, Poblacion, Makati City Century City Mall, Poblacion, Makati City, Mak... 121.027535 14.565443 French, Japanese, Desserts ... Botswana Pula(P) Yes No No No 3 4.8 Dark Green Excellent 314
1 6304287 Izakaya Kikufuji 162 Makati City Little Tokyo, 2277 Chino Roces Avenue, Legaspi... Little Tokyo, Legaspi Village, Makati City Little Tokyo, Legaspi Village, Makati City, Ma... 121.014101 14.553708 Japanese ... Botswana Pula(P) Yes No No No 3 4.5 Dark Green Excellent 591
2 6300002 Heat - Edsa Shangri-La 162 Mandaluyong City Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal... Edsa Shangri-La, Ortigas, Mandaluyong City Edsa Shangri-La, Ortigas, Mandaluyong City, Ma... 121.056831 14.581404 Seafood, Asian, Filipino, Indian ... Botswana Pula(P) Yes No No No 4 4.4 Green Very Good 270
3 6318506 Ooma 162 Mandaluyong City Third Floor, Mega Fashion Hall, SM Megamall, O... SM Megamall, Ortigas, Mandaluyong City SM Megamall, Ortigas, Mandaluyong City, Mandal... 121.056475 14.585318 Japanese, Sushi ... Botswana Pula(P) No No No No 4 4.9 Dark Green Excellent 365
4 6314302 Sambo Kojin 162 Mandaluyong City Third Floor, Mega Atrium, SM Megamall, Ortigas... SM Megamall, Ortigas, Mandaluyong City SM Megamall, Ortigas, Mandaluyong City, Mandal... 121.057508 14.584450 Japanese, Korean ... Botswana Pula(P) Yes No No No 4 4.8 Dark Green Excellent 229

5 rows × 21 columns

In [18]:
print(s_df['Price range'].unique())
[3 4 2 1]
In [20]:
print(s_df['Has Online delivery'].unique())
print(s_df['Has Table booking'].unique())
['No' 'Yes']
['Yes' 'No']
In [41]:
s_df['Has Online delivery'] = s_df['Has Online delivery'].map({'Yes': 1, 'No': 0})
s_df['Has Table booking'] = s_df['Has Table booking'].map({'Yes': 1, 'No': 0})
In [43]:
new_table = s_df.groupby('Price range')[['Has Online delivery', 'Has Table booking']].mean()
In [ ]:
new_table = new_table.rename(columns={
    'Has Online delivery binary': 'Has Online delivery',
    'Has Table booking binary': 'Has Table booking'
})
In [47]:
new_table = new_table.reset_index()
new_table
Out[47]:
index Price range Has Online delivery Has Table booking
0 0 1 0.157741 0.000225
1 1 2 0.413106 0.076775
2 2 3 0.291903 0.457386
3 3 4 0.090444 0.467577
In [49]:
fig = px.bar(new_table,
             x='Price range',
             y=['Has Online delivery', 'Has Table booking'],
             barmode='group',
             title='Average Online Delivery and Table Booking by Price Range',
             labels={'Price range': 'Price Range',
                     'Has Online delivery': 'Avg. Online Delivery (1=Yes, 0=No)',
                     'Has Table booking': 'Avg. Table Booking (1=Yes, 0=No)'})
fig.show()
In [53]:
delivery=pd.crosstab(s_df['Price range'], s_df['Has Online delivery'])
booking = pd.crosstab(s_df['Price range'], s_df['Has Table booking'])
print (delivery ,'\n' , booking)
Has Online delivery     0     1
Price range                    
1                    3743   701
2                    1827  1286
3                     997   411
4                     533    53 
 Has Table booking     0    1
Price range                 
1                  4443    1
2                  2874  239
3                   764  644
4                   312  274
In [55]:
combined = pd.concat(
    [delivery.rename(columns={0: 'No_Online', 1: 'Yes_Online'}),
     booking.rename(columns={0: 'No_Booking', 1: 'Yes_Booking'})],
    axis=1
)
print("\nCombined Cross Tab:", combined)
Combined Cross Tab:              No_Online  Yes_Online  No_Booking  Yes_Booking
Price range                                                
1                 3743         701        4443            1
2                 1827        1286        2874          239
3                  997         411         764          644
4                  533          53         312          274
In [57]:
combined['Price range'] = ['Low', 'Medium', 'High', 'Very High']  
combined = combined[['Price range', 'No_Online', 'Yes_Online', 'No_Booking', 'Yes_Booking']]
In [59]:
combined_1 = pd.melt(
    combined,
    id_vars='Price range',
    value_vars=['No_Online', 'Yes_Online', 'No_Booking', 'Yes_Booking'],
    var_name='Service_Status',
    value_name='Count'
)

fig = px.bar(
    combined_1,
    x='Price range',
    y='Count',
    color='Service_Status',
    barmode='group',
    title='Availability of Online Delivery and Table Booking by Price Range'
)
fig.show()

Determine if higher-priced restaurants aremore likely to offer these services.

In [62]:
# Plot Online Delivery vs Price Range
sns.barplot(x=new_table.index, y=new_table['Has Online delivery'])
plt.title("Online Delivery by Price Range")
plt.ylabel("Proportion Offering Online Delivery")
plt.xlabel("Price Range")
plt.show()

# Plot Table Booking vs Price Range
sns.barplot(x=new_table.index, y=new_table['Has Table booking'])
plt.title("Table Booking by Price Range")
plt.ylabel("Proportion Offering Table Booking")
plt.xlabel("Price Range")
plt.show()
No description has been provided for this image
No description has been provided for this image
In [64]:
combined_1 = pd.melt(
    combined,
    id_vars='Price range',
    value_vars=['No_Online', 'Yes_Online', 'No_Booking', 'Yes_Booking'],
    var_name='Service_Status',
    value_name='Count'
)

fig = px.bar(
    combined_1,
    x='Price range',
    y='Count',
    color='Service_Status',
    barmode='group',
    title='Availability of Online Delivery and Table Booking by Price Range'
)
fig.show()
In [ ]: